# Specify the parameters required for the analysis
# This dir should have the motrpac-bic-norm-qc repo
repo_local_dir = "~/Desktop/repos/"
# Runnable command for gsutil
gsutil_cmd = "~/google-cloud-sdk/bin/gsutil"
# Where should the data be downloaded to
local_data_dir = "~/Desktop/MoTrPAC/data/pass_1b/rnaseq/"
# Bucket structure is: tissue/platform/results/<data files>. For GET, this path should
# also have a qa_qc directory with the qc_metrics and sample_metadata files.
bucket = "gs://motrpac-release-data-staging/Results/pass1b-06/transcriptomics/"
# Specify bucket and local path for the phenotypic data
# this path is internal to BIC - faster
# pheno_bucket = "gs://bic_data_analysis/pass1b/pheno_dmaqc/merged2020-06-12/"
pheno_bucket = "gs://motrpac-internal-release2-results/pass1b-06/phenotype/"
pheno_local_dir = "~/Desktop/MoTrPAC/data/pass_1b/dmaqc_pheno/"
# specify parameters for filtering lowly expressed genes and normalization
# within a tissue keep genes with cpm >= min_cpm in at least min_num_samples samples
min_cpm = 0.5
min_num_samples = 2
norm_method="TMM"
# Specify how many PCs to examine
num_pcs = 5
num_pcs_for_outlier_analysis = 3
# Specify the significance level for association analysis
# (e.g., between a PC and a clinical variable)
p_thr = 0.0001
Specifiy the pipeline, metadata, and sample variables for the analysis.
# Define technical and biological variables to be considered in the QC
pipeline_qc_cols = c("RIN","reads",
"pct_rRNA","pct_globin",
"pct_umi_dup","median_5_3_bias",
"pct_multimapped","pct_GC","pct_chrM")
biospec_cols = c("registration.sex","key.anirandgroup",
"registration.batchnumber",
"training.staffid",
"is_control",
"vo2.max.test.vo2_max_visit1", # this assumes that visit1's are aligned
"terminal.weight.mg","time_to_freeze",
"timepoint","bid","pid")
# load functions and libraries
source(paste0(repo_local_dir,"motrpac-bic-norm-qc/tools/unsupervised_normalization_functions.R"))
source(paste0(repo_local_dir,"motrpac-bic-norm-qc/tools/gcp_functions.R"))
source(paste0(repo_local_dir,"motrpac-bic-norm-qc/tools/qc_report_aux_functions.R"))
source(paste0(repo_local_dir,"motrpac-bic-norm-qc/tools/config_session.R"))
source(paste0(repo_local_dir,"motrpac-bic-norm-qc/tools/association_analysis_methods.R"))
# Load the data
rna_seq_data = parse_rnaseq_data(bucket,local_path=local_data_dir,
remove_prev_files = T,GSUTIL_PATH=gsutil_cmd)
rnaseq_meta = rna_seq_data$sample_metadata
data_tables = rna_seq_data$data_tables
pheno_data = parse_pheno_data(pheno_bucket,local_path = pheno_local_dir,
remove_prev_files = T,GSUTIL_PATH=gsutil_cmd)
# add a tissue variable (for convinience)
pheno_data$viallabel_data$tissue =
pheno_data$viallabel_data$specimen.processing.sampletypedescription
# add the time to freeze variable ((for convinience))
pheno_data$viallabel_data$time_to_freeze =
pheno_data$viallabel_data$calculated.variables.frozetime_after_train -
pheno_data$viallabel_data$calculated.variables.deathtime_after_train
# add a binary is_control variable
pheno_data$viallabel_data$is_control = as.numeric(grepl("control",
pheno_data$viallabel_data$key.anirandgroup,ignore.case = T))
# add the timepoint as a number
# x - the text description of the group
get_numeric_timepoint<-function(x){
v = rep(0,length(x))
tps = c("Eight"=8,"Four"=4)
for(tp in names(tps)){
v[grepl(paste0(tp,"-week"),x,perl = T,ignore.case = T)] = tps[tp]
}
return(v)
}
pheno_data$viallabel_data$timepoint = get_numeric_timepoint(
pheno_data$viallabel_data$key.anirandgroup
)
The RNA-seq pipeline manual of procedures (MOP) defines a flagged sample (e.g., potentially problematic) as a sample that satisfies one of the following: (1) number of read pairs \(<20M\), (2) percent GC not in \([20,80]\), (3) percent rRNA reads \(>20\), (4) percent of uniquely mapped reads \(<60\), (5) RIN \(<6\), and (6) percent coding reads + percent utr reads \(< 50\).
rnaseq_meta$pipeline_flags = rnaseq_pipeline_flagged_samples(rnaseq_meta)
mop_flagged_samples = rownames(rnaseq_meta)[nchar(rnaseq_meta$pipeline_flags)>0]
We first remove unexpressed genes using the min_cpm and min_num_samples parameters. We then use edgeR for library size correction and for TMM normalization. For each dataset (i.e., tissue) we store the normalized data table and all additional variables (e.g., clinical data) in a list called norm_rnaseq_data.
norm_rnaseq_data = list()
for(dataset in names(rna_seq_data$data_tables)){
unnorm_counts = rna_seq_data$data_tables[[dataset]]
curr_samples = as.character(colnames(unnorm_counts))
# For rat data, take samples whose label id starts with "9" - remove qc pools
curr_samples = curr_samples[grepl("^9",curr_samples)]
unnorm_counts = unnorm_counts[,curr_samples]
# exclude low count genes in the current dataset
norm_counts = edgeR_normalized_log_cpm(as.matrix(unnorm_counts),
min_cpm = min_cpm,
min_num_samples = min_num_samples,
norm_method = norm_method
)
# get the dataset metadata
curr_meta1 = rnaseq_meta[curr_samples,pipeline_qc_cols]
curr_meta2 = pheno_data$viallabel_data[curr_samples,biospec_cols]
# get site and tissue
curr_sites = paste(unique(rnaseq_meta[curr_samples,"GET_site"]),collapse=",")
curr_tissue = paste(unique(pheno_data$viallabel_data[curr_samples,"tissue"],collapse=","))
curr_name = paste(curr_sites,curr_tissue,sep=",")
if(curr_name %in% names(norm_rnaseq_data)){
print(paste("Warning: dataset",curr_name,"appears more than once, taking the last occurrence"))
}
norm_rnaseq_data[[curr_name]] = list(
norm_counts = norm_counts,
pipeline_meta = curr_meta1,dmaqc_meta = curr_meta2)
}
Examine the effect of the normalization process, which includes removing unexpressed genes. For each dataset show the boxplots of 20 randomly selected samples.
for(i in 1:length(norm_rnaseq_data)){
unnorm_data = data_tables[[i]]
norm_data = norm_rnaseq_data[[i]][["norm_counts"]]
unnorm_data = log2(unnorm_data+1)
samp = sample(1:ncol(norm_data))[1:20]
curr_name = names(norm_rnaseq_data)[i]
colnames(unnorm_data) = NULL
colnames(norm_data) = NULL
boxplot(unnorm_data[,samp],
main=paste0(curr_name,"\nlog2 raw counts (",nrow(unnorm_counts)," genes)"),
ylab = "log2 raw counts",xaxt="n")
boxplot(norm_data[,samp],
main=paste0(curr_name,"\nnorm log cpm (",nrow(norm_data)," genes)"),
ylab = "log2 cpm",xaxt="n")
}
We plot the top two PCs for each tissue, color and shape correspond to randomization group and sex.
for(currname in names(norm_rnaseq_data)){
# run PCA
curr_data = norm_rnaseq_data[[currname]][["norm_counts"]]
curr_pca = prcomp(t(curr_data),scale. = T,center = T)
curr_pcax = curr_pca$x[,1:num_pcs]
explained_var = summary(curr_pca)[["importance"]][3,5]
norm_rnaseq_data[[currname]][["pca"]] = list(
pcax = curr_pcax,explained_var=explained_var
)
# plot
df = data.frame(
PC1 = curr_pcax[,1],PC2 = curr_pcax[,2],
randgroup = norm_rnaseq_data[[currname]][["dmaqc_meta"]][,"key.anirandgroup"],
sex = as.character(norm_rnaseq_data[[currname]][["dmaqc_meta"]][,"registration.sex"]),
stringsAsFactors = F
)
df$sex[df$sex=="1"] = "F"
df$sex[df$sex=="2"] = "M"
p = ggplot(df) +
geom_point(aes(x=PC1, y=PC2,col=randgroup, shape=sex)) +
ggtitle(currname)
plot(p)
}
Here we analyze the top principal components (5) in each tissue and compute their association with the selected variables above (e.g., the pipeline qc metrics). We use Spearman correlation and a linear test for significance.
For each dataset we also add the correlations among the metadata variables. These two analyses should be interpreted together, as the analyzed variables are not independent.
pcs_vs_qc_var_report = c()
metadata_variable_assoc_report = c()
for(currname in names(norm_rnaseq_data)){
curr_meta = cbind(norm_rnaseq_data[[currname]]$pipeline_meta,
norm_rnaseq_data[[currname]]$dmaqc_meta)
# remove the text group description
curr_meta = curr_meta[!grepl("randgr",names(curr_meta))]
# remove metadata variables with NAs
curr_meta = curr_meta[,!apply(is.na(curr_meta),2,any)]
# remove bid and pid
curr_meta = curr_meta[,!grepl("pid|bid",names(curr_meta))]
# remove fields withe zero variance
curr_meta = curr_meta[,apply(curr_meta,2,sd)>0]
# take the PCA results
curr_pcax = norm_rnaseq_data[[currname]][["pca"]][["pcax"]]
corrs = cor(curr_pcax,curr_meta,method="spearman")
corrsp = pairwise_eval(
curr_pcax,curr_meta,func=pairwise_association_wrapper,
f=1)
# Some ggplots have to be printed to be shown in the notebook
print(ggcorrplot(t(corrs),lab=T,lab_size=2.5,hc.order = F) +
ggtitle(currname) +
theme(plot.title = element_text(hjust = 0.5,size=20)))
for(i in 1:nrow(corrsp)){
for(j in 1:ncol(corrsp)){
if(corrsp[i,j]>p_thr){next}
pcs_vs_qc_var_report = rbind(pcs_vs_qc_var_report,
c(currname,
rownames(corrsp)[i],colnames(corrsp)[j],corrs[i,j],corrsp[i,j])
)
}
}
# compute correlations among the metadata variables
corrs = cor(curr_meta,method="spearman")
corrsp = pairwise_eval(
curr_meta,func=pairwise_association_wrapper,
f=1)
# Some ggplots have to be printed to be shown in the notebook
print(ggcorrplot(corrs,lab=T,lab_size=1.5,hc.order = T))
for(n1 in rownames(corrsp)){
for(n2 in rownames(corrsp)){
if(n1==n2){break}
if(n1 %in% biospec_cols &&
n2 %in% biospec_cols) {next}
if(corrsp[n1,n2]>p_thr){next}
metadata_variable_assoc_report = rbind(metadata_variable_assoc_report,
c(currname,n1,n2,corrs[n1,n2],corrsp[n1,n2])
)
}
}
}
# Format the reports - for a nicer presentation in a table
colnames(metadata_variable_assoc_report) = c(
"Dataset(tissue,site)","Var1","Var2","rho(spearman)","p-value")
colnames(pcs_vs_qc_var_report) = c("Dataset(tissue,site)","PC",
"qc_metric","rho(spearman)","p-value")
pcs_vs_qc_var_report[,5] = format(
as.numeric(pcs_vs_qc_var_report[,5]),digits=3)
pcs_vs_qc_var_report[,4] = format(
as.numeric(pcs_vs_qc_var_report[,4]),digits=3)
metadata_variable_assoc_report[,5] = format(
as.numeric(metadata_variable_assoc_report[,5]),digits=3)
metadata_variable_assoc_report[,4] = format(
as.numeric(metadata_variable_assoc_report[,4]),digits=3)
kable(pcs_vs_qc_var_report,longtable=T,
caption = "Correlations between PCs and other variables") %>%
kable_styling(latex_options = c("repeat_header"),font_size = 8)
| Dataset(tissue,site) | PC | qc_metric | rho(spearman) | p-value |
|---|---|---|---|---|
| Stanford,PaxGene RNA | PC1 | pct_GC | 0.54730 | 2.61e-05 |
| Stanford,PaxGene RNA | PC2 | registration.sex | -0.78582 | 6.73e-11 |
| Stanford,PaxGene RNA | PC2 | training.staffid | 0.64445 | 4.38e-06 |
| Stanford,PaxGene RNA | PC2 | terminal.weight.mg | -0.71581 | 3.96e-10 |
| Stanford,PaxGene RNA | PC4 | pct_chrM | -0.04221 | 5.69e-05 |
| MSSM,Hippocampus | PC1 | pct_globin | 0.03882 | 1.68e-08 |
| MSSM,Hippocampus | PC5 | pct_chrM | 0.60134 | 3.44e-05 |
| MSSM,Cortex | PC1 | pct_chrM | 0.62220 | 2.59e-08 |
| MSSM,Cortex | PC5 | pct_rRNA | -0.54427 | 7.95e-06 |
| MSSM,Cortex | PC5 | pct_multimapped | -0.66191 | 7.84e-09 |
| MSSM,Hypothalamus | PC1 | pct_chrM | 0.27309 | 1.75e-06 |
| MSSM,Hypothalamus | PC2 | pct_umi_dup | 0.47544 | 3.23e-06 |
| MSSM,Hypothalamus | PC3 | reads | 0.25637 | 5.84e-05 |
| MSSM,Hypothalamus | PC4 | time_to_freeze | 0.29575 | 2.59e-05 |
| Stanford,Gastrocnemius | PC1 | pct_multimapped | 0.60003 | 3.00e-06 |
| Stanford,Gastrocnemius | PC1 | pct_chrM | 0.71390 | 1.04e-08 |
| Stanford,Gastrocnemius | PC1 | registration.sex | 0.74701 | 2.44e-09 |
| Stanford,Gastrocnemius | PC1 | terminal.weight.mg | 0.58048 | 4.13e-07 |
| Stanford,Gastrocnemius | PC2 | pct_multimapped | 0.45067 | 7.16e-05 |
| Stanford,Gastrocnemius | PC2 | pct_GC | -0.66527 | 1.17e-06 |
| Stanford,Gastrocnemius | PC5 | is_control | -0.57862 | 6.68e-06 |
| Stanford,Gastrocnemius | PC5 | timepoint | -0.74686 | 9.20e-08 |
| MSSM,Vastus Lateralis | PC1 | pct_umi_dup | -0.29283 | 3.48e-07 |
| MSSM,Vastus Lateralis | PC3 | pct_multimapped | -0.74944 | 3.86e-09 |
| MSSM,Vastus Lateralis | PC3 | pct_GC | 0.65699 | 1.97e-05 |
| MSSM,Vastus Lateralis | PC3 | pct_chrM | -0.85345 | 6.84e-14 |
| MSSM,Vastus Lateralis | PC4 | registration.sex | 0.84680 | 1.17e-16 |
| MSSM,Vastus Lateralis | PC4 | terminal.weight.mg | 0.70674 | 1.16e-12 |
| Stanford,Heart | PC1 | pct_multimapped | 0.50415 | 4.98e-05 |
| Stanford,Heart | PC1 | pct_chrM | 0.64283 | 3.78e-08 |
| Stanford,Heart | PC1 | registration.sex | 0.86620 | 1.19e-26 |
| Stanford,Heart | PC1 | vo2.max.test.vo2_max_visit1 | -0.59260 | 9.89e-06 |
| Stanford,Heart | PC1 | terminal.weight.mg | 0.76115 | 2.75e-20 |
| Stanford,Heart | PC2 | pct_multimapped | -0.62741 | 1.79e-06 |
| Stanford,Heart | PC2 | pct_GC | 0.79698 | 3.14e-09 |
| Stanford,Heart | PC2 | pct_chrM | -0.56274 | 2.52e-05 |
| Stanford,Heart | PC3 | is_control | -0.61673 | 1.91e-06 |
| Stanford,Heart | PC4 | pct_umi_dup | 0.23018 | 2.37e-05 |
| MSSM,Kidney | PC1 | pct_multimapped | -0.75883 | 7.58e-19 |
| MSSM,Kidney | PC1 | pct_chrM | -0.55066 | 6.24e-08 |
| MSSM,Kidney | PC1 | registration.sex | -0.86620 | 2.08e-32 |
| MSSM,Kidney | PC1 | vo2.max.test.vo2_max_visit1 | 0.55884 | 5.46e-06 |
| MSSM,Kidney | PC1 | terminal.weight.mg | -0.78655 | 1.20e-24 |
| MSSM,Kidney | PC2 | pct_umi_dup | 0.21306 | 6.24e-12 |
| MSSM,Kidney | PC3 | median_5_3_bias | -0.54509 | 1.80e-05 |
| Stanford,Adrenals | PC1 | pct_rRNA | 0.51625 | 3.94e-08 |
| Stanford,Adrenals | PC1 | pct_multimapped | 0.54546 | 1.98e-12 |
| Stanford,Adrenals | PC1 | pct_GC | -0.49879 | 3.28e-10 |
| Stanford,Adrenals | PC1 | pct_chrM | 0.64581 | 2.65e-19 |
| Stanford,Adrenals | PC3 | registration.sex | 0.79690 | 1.10e-11 |
| Stanford,Adrenals | PC3 | vo2.max.test.vo2_max_visit1 | -0.58386 | 2.57e-05 |
| Stanford,Adrenals | PC3 | terminal.weight.mg | 0.64051 | 3.23e-09 |
| Stanford,Colon | PC1 | median_5_3_bias | -0.36819 | 3.01e-05 |
| Stanford,Colon | PC2 | pct_umi_dup | -0.51549 | 9.91e-05 |
| Stanford,Colon | PC2 | median_5_3_bias | 0.48594 | 2.48e-07 |
| Stanford,Colon | PC2 | pct_chrM | 0.51985 | 9.55e-06 |
| Stanford,Colon | PC3 | pct_rRNA | -0.00987 | 1.80e-06 |
| Stanford,Colon | PC3 | pct_umi_dup | -0.03558 | 8.77e-05 |
| Stanford,Colon | PC5 | pct_multimapped | -0.63185 | 8.22e-06 |
| MSSM,Spleen | PC1 | median_5_3_bias | -0.49156 | 3.13e-05 |
| MSSM,Spleen | PC1 | registration.sex | -0.76087 | 8.31e-11 |
| MSSM,Spleen | PC1 | terminal.weight.mg | -0.57092 | 3.86e-08 |
| MSSM,Spleen | PC4 | pct_GC | -0.66563 | 1.56e-06 |
| Stanford,Testes | PC5 | pct_chrM | -0.65320 | 8.66e-05 |
| Stanford,Ovaries | PC5 | pct_umi_dup | -0.66261 | 4.15e-05 |
| Stanford,Aorta | PC1 | pct_globin | -0.51798 | 1.23e-05 |
| Stanford,Aorta | PC1 | pct_multimapped | 0.96281 | 3.48e-33 |
| Stanford,Aorta | PC1 | pct_GC | -0.95267 | 1.61e-26 |
| Stanford,Aorta | PC1 | pct_chrM | 0.97666 | 1.87e-35 |
| Stanford,Aorta | PC1 | training.staffid | 0.69296 | 6.65e-18 |
| Stanford,Aorta | PC1 | time_to_freeze | 0.50612 | 8.70e-17 |
| Stanford,Aorta | PC1 | timepoint | -0.63065 | 2.21e-05 |
| Stanford,Aorta | PC2 | pct_umi_dup | -0.04269 | 9.91e-05 |
| Stanford,Aorta | PC2 | median_5_3_bias | 0.54195 | 1.33e-07 |
| Stanford,Aorta | PC3 | RIN | -0.20513 | 7.32e-07 |
| Stanford,Aorta | PC3 | median_5_3_bias | 0.25621 | 2.12e-05 |
| Stanford,Lung | PC1 | pct_multimapped | 0.71052 | 9.85e-09 |
| Stanford,Lung | PC1 | pct_chrM | 0.66388 | 1.54e-07 |
| Stanford,Lung | PC1 | registration.sex | -0.60287 | 5.38e-06 |
| Stanford,Lung | PC1 | terminal.weight.mg | -0.54110 | 1.64e-05 |
| Stanford,Lung | PC2 | timepoint | -0.53148 | 1.23e-05 |
| Stanford,Lung | PC4 | registration.sex | -0.62782 | 7.91e-07 |
| Stanford,Lung | PC4 | training.staffid | 0.55090 | 6.21e-05 |
| Stanford,Lung | PC4 | terminal.weight.mg | -0.53322 | 5.43e-06 |
| Stanford,Small Intestine | PC1 | pct_multimapped | 0.88727 | 1.54e-16 |
| Stanford,Small Intestine | PC2 | RIN | -0.42656 | 1.71e-05 |
| Stanford,Small Intestine | PC2 | pct_umi_dup | 0.34204 | 1.26e-05 |
| Stanford,Small Intestine | PC4 | training.staffid | 0.62713 | 5.27e-06 |
| MSSM,Liver | PC1 | pct_rRNA | 0.58631 | 6.69e-08 |
| MSSM,Liver | PC1 | pct_multimapped | -0.79569 | 6.73e-34 |
| MSSM,Liver | PC1 | registration.sex | -0.86620 | 1.02e-40 |
| MSSM,Liver | PC1 | training.staffid | 0.59248 | 3.44e-05 |
| MSSM,Liver | PC1 | vo2.max.test.vo2_max_visit1 | 0.52950 | 3.56e-06 |
| MSSM,Liver | PC1 | terminal.weight.mg | -0.72282 | 1.22e-22 |
| MSSM,Brown Adipose | PC1 | pct_globin | -0.33607 | 7.62e-05 |
| MSSM,Brown Adipose | PC1 | pct_multimapped | 0.53886 | 8.24e-12 |
| MSSM,Brown Adipose | PC1 | pct_GC | -0.46636 | 3.25e-09 |
| MSSM,Brown Adipose | PC1 | pct_chrM | 0.63563 | 1.39e-17 |
| MSSM,Brown Adipose | PC3 | registration.sex | -0.74424 | 3.00e-07 |
| MSSM,Brown Adipose | PC3 | terminal.weight.mg | -0.64738 | 1.89e-06 |
| MSSM,Brown Adipose | PC4 | pct_umi_dup | 0.53352 | 1.15e-05 |
| MSSM,White Adipose | PC1 | pct_rRNA | -0.80195 | 3.45e-15 |
| MSSM,White Adipose | PC1 | pct_globin | 0.54231 | 7.01e-06 |
| MSSM,White Adipose | PC1 | pct_multimapped | 0.82213 | 1.28e-14 |
| MSSM,White Adipose | PC1 | registration.sex | -0.86620 | 7.37e-24 |
| MSSM,White Adipose | PC1 | vo2.max.test.vo2_max_visit1 | 0.64783 | 2.76e-07 |
| MSSM,White Adipose | PC1 | terminal.weight.mg | -0.73382 | 2.08e-19 |
| MSSM,White Adipose | PC2 | pct_GC | -0.22447 | 4.34e-14 |
| MSSM,White Adipose | PC3 | pct_umi_dup | -0.05599 | 1.25e-05 |
kable(metadata_variable_assoc_report,longtable=T,
caption = "Correlations between metadata variables")%>%
kable_styling(latex_options = c("repeat_header"),font_size = 8)
| Dataset(tissue,site) | Var1 | Var2 | rho(spearman) | p-value |
|---|---|---|---|---|
| Stanford,PaxGene RNA | pct_multimapped | pct_rRNA | 0.7935 | 9.87e-13 |
| MSSM,Hippocampus | pct_globin | pct_rRNA | 0.3839 | 2.27e-06 |
| MSSM,Hippocampus | pct_multimapped | pct_rRNA | 0.4360 | 7.47e-06 |
| MSSM,Hippocampus | pct_GC | pct_multimapped | -0.6186 | 3.81e-05 |
| MSSM,Hippocampus | pct_chrM | pct_multimapped | 0.6938 | 4.63e-07 |
| MSSM,Cortex | pct_multimapped | pct_rRNA | 0.8151 | 2.84e-13 |
| MSSM,Cortex | pct_chrM | pct_multimapped | 0.5707 | 1.51e-06 |
| MSSM,Cortex | registration.sex | pct_rRNA | -0.5357 | 4.56e-05 |
| MSSM,Cortex | terminal.weight.mg | pct_rRNA | -0.4760 | 7.79e-05 |
| MSSM,Hypothalamus | pct_multimapped | pct_rRNA | 0.6059 | 7.46e-06 |
| MSSM,Hypothalamus | is_control | pct_umi_dup | 0.5301 | 1.66e-06 |
| MSSM,Hypothalamus | timepoint | pct_umi_dup | 0.6849 | 2.77e-05 |
| Stanford,Gastrocnemius | pct_multimapped | pct_umi_dup | -0.7333 | 1.07e-05 |
| Stanford,Gastrocnemius | pct_GC | pct_umi_dup | 0.5396 | 2.32e-11 |
| Stanford,Gastrocnemius | pct_GC | pct_multimapped | -0.7772 | 7.28e-10 |
| Stanford,Gastrocnemius | pct_chrM | pct_umi_dup | -0.6007 | 4.16e-06 |
| Stanford,Gastrocnemius | pct_chrM | pct_multimapped | 0.9359 | 1.75e-22 |
| Stanford,Gastrocnemius | pct_chrM | pct_GC | -0.7711 | 4.28e-10 |
| Stanford,Gastrocnemius | registration.sex | pct_chrM | 0.5280 | 9.68e-05 |
| Stanford,Gastrocnemius | training.staffid | pct_chrM | -0.5440 | 9.83e-06 |
| MSSM,Vastus Lateralis | median_5_3_bias | pct_globin | -0.6810 | 1.70e-06 |
| MSSM,Vastus Lateralis | pct_GC | pct_multimapped | -0.8530 | 2.10e-18 |
| MSSM,Vastus Lateralis | pct_chrM | pct_multimapped | 0.9376 | 2.39e-28 |
| MSSM,Vastus Lateralis | pct_chrM | pct_GC | -0.8355 | 2.72e-14 |
| MSSM,Vastus Lateralis | is_control | pct_umi_dup | 0.5613 | 3.65e-06 |
| MSSM,Vastus Lateralis | timepoint | pct_umi_dup | 0.7167 | 1.83e-06 |
| Stanford,Heart | pct_GC | pct_multimapped | -0.7236 | 3.62e-08 |
| Stanford,Heart | pct_chrM | pct_multimapped | 0.8542 | 4.21e-15 |
| Stanford,Heart | pct_chrM | pct_GC | -0.7301 | 2.62e-08 |
| Stanford,Heart | registration.sex | pct_rRNA | 0.5257 | 9.68e-05 |
| Stanford,Heart | registration.sex | pct_multimapped | 0.5753 | 1.82e-05 |
| Stanford,Heart | registration.sex | pct_chrM | 0.7498 | 1.06e-09 |
| Stanford,Heart | terminal.weight.mg | pct_chrM | 0.5948 | 1.95e-07 |
| MSSM,Kidney | pct_GC | pct_multimapped | -0.5793 | 4.74e-05 |
| MSSM,Kidney | pct_chrM | pct_multimapped | 0.8542 | 6.66e-15 |
| MSSM,Kidney | registration.sex | pct_multimapped | 0.8668 | 5.83e-21 |
| MSSM,Kidney | registration.sex | pct_chrM | 0.7526 | 5.36e-10 |
| MSSM,Kidney | vo2.max.test.vo2_max_visit1 | pct_multimapped | -0.5917 | 2.85e-06 |
| MSSM,Kidney | vo2.max.test.vo2_max_visit1 | pct_chrM | -0.5739 | 2.82e-05 |
| MSSM,Kidney | terminal.weight.mg | pct_multimapped | 0.7903 | 2.55e-17 |
| MSSM,Kidney | terminal.weight.mg | pct_chrM | 0.6760 | 4.89e-09 |
| Stanford,Adrenals | pct_multimapped | pct_rRNA | 0.4597 | 2.00e-06 |
| Stanford,Adrenals | pct_GC | pct_rRNA | -0.3747 | 5.34e-05 |
| Stanford,Adrenals | pct_GC | pct_multimapped | -0.9350 | 2.15e-24 |
| Stanford,Adrenals | pct_chrM | pct_rRNA | 0.4816 | 1.99e-07 |
| Stanford,Adrenals | pct_chrM | pct_multimapped | 0.9486 | 1.64e-29 |
| Stanford,Adrenals | pct_chrM | pct_GC | -0.9156 | 1.38e-21 |
| Stanford,Adrenals | registration.sex | pct_GC | 0.6401 | 4.91e-05 |
| Stanford,Adrenals | terminal.weight.mg | pct_GC | 0.5400 | 9.51e-05 |
| Stanford,Colon | pct_umi_dup | pct_rRNA | 0.2108 | 8.62e-06 |
| Stanford,Colon | pct_umi_dup | pct_globin | 0.0360 | 1.16e-06 |
| Stanford,Colon | median_5_3_bias | RIN | 0.5090 | 1.87e-06 |
| Stanford,Colon | median_5_3_bias | pct_globin | -0.2791 | 8.81e-05 |
| Stanford,Colon | median_5_3_bias | pct_umi_dup | -0.5355 | 1.12e-05 |
| Stanford,Colon | pct_chrM | pct_GC | -0.7619 | 1.22e-10 |
| MSSM,Spleen | pct_globin | pct_rRNA | 0.6747 | 5.49e-08 |
| MSSM,Spleen | pct_multimapped | pct_rRNA | 0.5714 | 4.94e-09 |
| MSSM,Spleen | pct_multimapped | pct_globin | 0.3904 | 4.81e-05 |
| MSSM,Spleen | pct_GC | pct_rRNA | 0.4826 | 8.29e-07 |
| MSSM,Spleen | pct_chrM | median_5_3_bias | -0.5517 | 7.64e-07 |
| MSSM,Spleen | pct_chrM | pct_GC | -0.1973 | 2.74e-06 |
| MSSM,Spleen | training.staffid | median_5_3_bias | -0.5544 | 9.25e-06 |
| MSSM,Spleen | training.staffid | pct_chrM | 0.5683 | 2.40e-06 |
| MSSM,Spleen | terminal.weight.mg | pct_globin | -0.5804 | 3.27e-05 |
| Stanford,Aorta | pct_umi_dup | RIN | -0.4258 | 1.36e-11 |
| Stanford,Aorta | median_5_3_bias | RIN | 0.0963 | 9.43e-08 |
| Stanford,Aorta | pct_multimapped | pct_globin | -0.5154 | 4.63e-05 |
| Stanford,Aorta | pct_GC | pct_globin | 0.5006 | 9.05e-05 |
| Stanford,Aorta | pct_GC | pct_multimapped | -0.9760 | 9.96e-35 |
| Stanford,Aorta | pct_chrM | pct_globin | -0.5600 | 2.75e-05 |
| Stanford,Aorta | pct_chrM | pct_multimapped | 0.9796 | 7.53e-41 |
| Stanford,Aorta | pct_chrM | pct_GC | -0.9712 | 6.70e-37 |
| Stanford,Aorta | training.staffid | pct_globin | -0.6208 | 9.80e-06 |
| Stanford,Aorta | training.staffid | pct_multimapped | 0.6930 | 1.60e-14 |
| Stanford,Aorta | training.staffid | pct_GC | -0.6995 | 2.54e-13 |
| Stanford,Aorta | training.staffid | pct_chrM | 0.6930 | 8.33e-14 |
| Stanford,Aorta | time_to_freeze | pct_globin | -0.7112 | 4.92e-06 |
| Stanford,Aorta | time_to_freeze | pct_multimapped | 0.5392 | 1.51e-13 |
| Stanford,Aorta | time_to_freeze | pct_GC | -0.5218 | 4.67e-12 |
| Stanford,Aorta | time_to_freeze | pct_chrM | 0.5394 | 7.87e-13 |
| Stanford,Aorta | timepoint | pct_multimapped | -0.5989 | 1.56e-05 |
| Stanford,Aorta | timepoint | pct_GC | 0.6194 | 3.56e-06 |
| Stanford,Aorta | timepoint | pct_chrM | -0.6121 | 1.48e-05 |
| Stanford,Lung | pct_rRNA | RIN | -0.1516 | 3.94e-08 |
| Stanford,Lung | pct_chrM | pct_GC | -0.5574 | 1.57e-05 |
| Stanford,Lung | registration.sex | pct_chrM | -0.5904 | 3.22e-05 |
| Stanford,Lung | terminal.weight.mg | pct_chrM | -0.5088 | 6.87e-05 |
| Stanford,Small Intestine | median_5_3_bias | pct_umi_dup | -0.5035 | 5.71e-06 |
| Stanford,Small Intestine | pct_chrM | pct_multimapped | 0.6780 | 5.83e-08 |
| Stanford,Small Intestine | pct_chrM | pct_GC | -0.7625 | 1.38e-08 |
| MSSM,Liver | median_5_3_bias | pct_globin | -0.5922 | 4.30e-05 |
| MSSM,Liver | pct_multimapped | pct_rRNA | -0.5459 | 4.59e-07 |
| MSSM,Liver | registration.sex | pct_rRNA | -0.7922 | 6.49e-09 |
| MSSM,Liver | registration.sex | pct_multimapped | 0.8667 | 4.39e-37 |
| MSSM,Liver | vo2.max.test.vo2_max_visit1 | pct_multimapped | -0.6498 | 1.15e-06 |
| MSSM,Liver | terminal.weight.mg | pct_rRNA | -0.6908 | 5.96e-08 |
| MSSM,Liver | terminal.weight.mg | pct_multimapped | 0.7452 | 3.21e-23 |
| MSSM,Brown Adipose | pct_GC | pct_multimapped | -0.8690 | 1.82e-22 |
| MSSM,Brown Adipose | pct_chrM | pct_multimapped | 0.9219 | 1.44e-27 |
| MSSM,Brown Adipose | pct_chrM | pct_GC | -0.8310 | 2.79e-20 |
| MSSM,White Adipose | median_5_3_bias | pct_globin | -0.7089 | 3.42e-06 |
| MSSM,White Adipose | pct_multimapped | pct_rRNA | -0.6730 | 1.05e-08 |
| MSSM,White Adipose | pct_multimapped | pct_globin | 0.6009 | 2.71e-06 |
| MSSM,White Adipose | pct_chrM | RIN | -0.6459 | 8.45e-09 |
| MSSM,White Adipose | registration.sex | pct_rRNA | 0.9057 | 2.21e-19 |
| MSSM,White Adipose | registration.sex | pct_globin | -0.5366 | 8.92e-05 |
| MSSM,White Adipose | registration.sex | pct_multimapped | -0.7500 | 1.68e-09 |
| MSSM,White Adipose | vo2.max.test.vo2_max_visit1 | pct_rRNA | -0.6376 | 1.75e-06 |
| MSSM,White Adipose | vo2.max.test.vo2_max_visit1 | pct_multimapped | 0.5396 | 6.46e-05 |
| MSSM,White Adipose | terminal.weight.mg | pct_rRNA | 0.7810 | 1.44e-15 |
| MSSM,White Adipose | terminal.weight.mg | pct_globin | -0.4920 | 5.95e-05 |
| MSSM,White Adipose | terminal.weight.mg | pct_multimapped | -0.7044 | 7.62e-10 |
In this analysis, outliers are flagged by examining the boxplot of each PC, extending its whiskers to three times the inter-quantile range away from the boxplot. Samples outside this range are then flagged.
Note that samples are flagged using an automatic analysis of the principal components. Such analyses may flag samples because of true biological effects and thus further examination is required before determining if flagged samples represent low quality samples. Moreover, note that we plot outliers for the rii (raw intensitied) data as well. Typically, outliers from such datasets will not appear in the bettern normalized ratio data.
pca_outliers_report = c()
for(currname in names(norm_rnaseq_data)){
curr_pcax = curr_pcax = norm_rnaseq_data[[currname]][["pca"]][["pcax"]]
# Univariate: use IQRs
pca_outliers = c()
for(j in 1:num_pcs_for_outlier_analysis){
outlier_values <- boxplot.stats(curr_pcax[,j],coef = 3)$out
for(outlier in names(outlier_values)){
pca_outliers_report = rbind(pca_outliers_report,
c(currname,paste("PC",j,sep=""),outlier,
format(outlier_values[outlier],digits=5))
)
if(!is.element(outlier,names(pca_outliers))){
pca_outliers[outlier] = outlier_values[outlier]
}
}
}
# Plot the outliers
if(length(pca_outliers)>0){
# print(length(kNN_outliers))
df = data.frame(curr_pcax,
outliers = rownames(curr_pcax) %in% names(pca_outliers))
col = rep("black",nrow(df))
col[df$outliers] = "green"
plot(df$PC1,df$PC2,pch = as.numeric(df$outliers),col=col,lwd=2,cex=1,
main = paste(currname,"\nflagged outliers"),
xlab = "PC1",ylab="PC2")
plot(df$PC3,df$PC4,pch = as.numeric(df$outliers),col=col,lwd=2,cex=1,
main = paste(currname,"\nflagged outliers"),
xlab = "PC3",ylab="PC4")
}
}
if(!is.null(dim(pca_outliers_report))){
colnames(pca_outliers_report) = c("dataset","PC","sample","score")
}
We verify that the sex of a sample can be predicted from the percent of chromosome Y and percent of chromosome X reads. This is done automatically by training a logistic regression classifier, but we also plot the 2D plot for each tissue.
# sex checks - using simple logistic regression classifier (LOO-CV)
sex_read_info = cbind(rnaseq_meta$pct_chrX,rnaseq_meta$pct_chrY)
rownames(sex_read_info) = rownames(rnaseq_meta)
sex_check_outliers = c()
for(currname in names(norm_rnaseq_data)){
# 1 is female
curr_sex = norm_rnaseq_data[[currname]]$dmaqc_meta$registration.sex
read_info = sex_read_info[
rownames(norm_rnaseq_data[[currname]]$dmaqc_meta),]
df = data.frame(sex = as.character(curr_sex),
pct_chrX = read_info[,1],
pct_chrY = read_info[,2],stringsAsFactors = F)
df$sex[df$sex=="1"] = "F"
df$sex[df$sex=="2"] = "M"
if(length(table(df$sex))==1){
print(paste("Cannot run sex check in dataset:",currname))
print("There are no data points from both classes, skipping")
next
}
df = df[!apply(is.na(df[,1:2]),1,any),]
if(nrow(df)<5){
print(paste("Cannot run sex check in dataset:",currname))
print("Too many missing values in pct_chrY and pct_chrX, skipping")
next
}
p = ggplot(df) +
geom_point(aes(x=pct_chrX, y=pct_chrY,col=sex, shape=sex)) +
ggtitle(paste0(currname," - sex check"))
plot(p)
train_control <- trainControl(method = "cv", number = nrow(df),
savePredictions = TRUE)
# train the model on training set
model <- train(sex ~ .,data = df,
trControl = train_control,
method = "glm", family=binomial(link="logit"))
# CV redults
cv_res = model$pred
cv_errors = cv_res[,1] != cv_res[,2]
err_samples = rownames(df)[cv_res[cv_errors,"rowIndex"]]
for(samp in err_samples){
sex_check_outliers = rbind(sex_check_outliers,
c(currname,samp))
}
}
## [1] "Cannot run sex check in dataset: Stanford,Testes"
## [1] "There are no data points from both classes, skipping"
## [1] "Cannot run sex check in dataset: Stanford,Ovaries"
## [1] "There are no data points from both classes, skipping"
if(! is.null(dim(sex_check_outliers))){
colnames(sex_check_outliers) = c("dataset","sample")
}
# add the dataset name to mop outliers
sample2dataset = paste(tolower(rnaseq_meta[,c("Tissue")]),
tolower(rnaseq_meta[,c("GET_site")]),sep=",")
names(sample2dataset) = rownames(rnaseq_meta)
all_flagged = NULL
if(length(pca_outliers_report)>0){
all_flagged = union(all_flagged,pca_outliers_report[,"sample"])
}
if(length(sex_check_outliers)>0){
all_flagged = union(all_flagged,sex_check_outliers[,"sample"])
}
all_flagged = union(all_flagged,mop_flagged_samples)
flagged_sample_report = c()
for(samp in all_flagged){
samp_dataset = sample2dataset[samp]
samp_metric = NULL
if(samp %in% mop_flagged_samples){
samp_metric = rnaseq_meta[samp,"pipeline_flags"]
}
if(!is.null(dim(sex_check_outliers)) &&
samp %in% sex_check_outliers[,"sample"]){
samp_metric = c(samp_metric,"Sex-flagged")
}
if(!is.null(dim(pca_outliers_report)) &&
samp %in% pca_outliers_report[,"sample"]){
curr_pcs = pca_outliers_report[pca_outliers_report[,"sample"]==samp,"PC"]
samp_metric = c(samp_metric,curr_pcs)
}
flagged_sample_report = rbind(flagged_sample_report,
c(samp,samp_dataset,paste(samp_metric,collapse=","))
)
}
colnames(flagged_sample_report) = c("Viallabel","Dataset","Methods")
kable(flagged_sample_report,longtable=T,
caption = "Outliers detected, MOP, Sex check, and by tissue PCA data")%>%
kable_styling(font_size = 8,latex_options = c("hold_position", "repeat_header"))
| Viallabel | Dataset | Methods |
|---|---|---|
| 90222013002 | paxgene rna,stanford | Sex-flagged,PC1 |
| 90406015202 | hippocampus powder,mssm | PC1 |
| 90406015402 | hypothalmus powder,mssm | PC1 |
| 90252015402 | hypothalmus powder,mssm | PC3 |
| 90252015603 | vastus lateralis powder,mssm | PC1,PC2 |
| 90585015603 | vastus lateralis powder,mssm | PC1 |
| 90217015603 | vastus lateralis powder,mssm | PC3 |
| 90444015902 | kidney powder,mssm | PC2 |
| 90571016002 | adrenal powder,stanford | PC1 |
| 90266016002 | adrenal powder,stanford | PC1 |
| 90422016002 | adrenal powder,stanford | PC1 |
| 90426016002 | adrenal powder,stanford | PC1 |
| 90229016102 | colon powder,stanford | PC3 |
| 90581016402 | ovaries powder,stanford | PC1 |
| 90576016402 | ovaries powder,stanford | PC1 |
| 90571016402 | ovaries powder,stanford | PC1 |
| 90406016502 | aorta powder,stanford | PC3 |
| 90441016702 | small intestine powder,stanford | PC2 |
| 90218016803 | liver powder,mssm | PC2 |
| 90266016902 | brown adipose powder,mssm | PC1 |
| 90245017005 | white adipose powder,mssm | PC2 |
| 90423017005 | white adipose powder,mssm | PC2,PC3 |
| 90564017005 | white adipose powder,mssm | PC2 |
| 90410017005 | white adipose powder,mssm | PC3 |
| 90251013001 | paxgene rna,stanford | Sex-flagged |
| 90441016102 | colon powder,stanford | RIN<6 |
| 90245013002 | paxgene rna,stanford | NumReads<20M |
| 90218013001 | paxgene rna,stanford | NumReads<20M |
# use the raw bucket names as the output location
# (implied assumption: rna_seq_data and norm_rnaseq_data have the same order)
get_out_bucket_from_local_file_name<-function(b,input_bucket,shift_size=3){
arr = strsplit(b,split="/")[[1]]
n = length(arr)
local_b = paste(arr[(n-shift_size):(n-1)],collapse="/")
out_bucket = paste0(input_bucket,"/",local_b,"/")
out_bucket = gsub("//","/",out_bucket)
out_bucket = gsub("gs:/+","gs://",out_bucket)
return(out_bucket)
}
out_buckets = sapply(rna_seq_data$data_table_paths,
get_out_bucket_from_local_file_name,input_bucket=bucket)
for(i in 1:length(norm_rnaseq_data)){
out_b = out_buckets[i]
x = norm_rnaseq_data[[i]]$norm_counts
x_bids = pheno_data$viallabel_data[colnames(x),"bid"]
x_pids = pheno_data$viallabel_data[colnames(x),"pid"]
newx = cbind(rownames(x),x)
rownames(newx) = NULL
for(j in 2:ncol(newx)){
newx[,j] = round(as.numeric(newx[,j]),digits=5)
}
colnames(newx)[1] = "geneid"
newx = rbind(c("bid",x_bids),newx)
newx = rbind(c("pid",x_pids),newx)
newx = rbind(c("viallabel",colnames(x)),newx)
colnames(newx) = NULL
# a simple sanity check
m = newx[-c(1:3),-1]
mode(m) = "numeric"
if(!all(abs(m-x)< 1e-05)){
print("Error in parsing the matrix, breaking")
break
}
# save the output to the target bucket
curr_t = names(rna_seq_data$data_tables)[i]
curr_t = tolower(gsub("_","-",curr_t))
local_fname = paste0(local_data_dir,
"motrpac_pass1b-06_",curr_t,"_transcript-rna-seq_",
"normalized-log-cpm.txt")
fwrite(newx,file=local_fname,sep="\t",quote=F,
row.names = F,col.names = F)
cmd = paste(gsutil_cmd,"cp",local_fname,out_b)
system(cmd)
}
## x being coerced from class: matrix to data.table
## x being coerced from class: matrix to data.table
## x being coerced from class: matrix to data.table
## x being coerced from class: matrix to data.table
## x being coerced from class: matrix to data.table
## x being coerced from class: matrix to data.table
## x being coerced from class: matrix to data.table
## x being coerced from class: matrix to data.table
## x being coerced from class: matrix to data.table
## x being coerced from class: matrix to data.table
## x being coerced from class: matrix to data.table
## x being coerced from class: matrix to data.table
## x being coerced from class: matrix to data.table
## x being coerced from class: matrix to data.table
## x being coerced from class: matrix to data.table
## x being coerced from class: matrix to data.table
## x being coerced from class: matrix to data.table
## x being coerced from class: matrix to data.table
## x being coerced from class: matrix to data.table